Retrieving the Data

In [6]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt #plot matplotlib
import seaborn as sns #plot seaborn
color = sns.color_palette()
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
In [7]:
import os
print(os.listdir("C:/Users/Romansya/Home Credit Risk/input"))
['app_test.csv', 'app_train.csv', 'columns_description.csv', 'installment_payment.csv', 'prev_app.csv']
In [8]:
app_test = pd.read_csv("C:/Users/Romansya/Home Credit Risk/input/app_test.csv")
app_train = pd.read_csv("C:/Users/Romansya/Home Credit Risk/input/app_train.csv")
installment_payment = pd.read_csv("C:/Users/Romansya/Home Credit Risk/input/installment_payment.csv")
prev_app = pd.read_csv("C:/Users/Romansya/Home Credit Risk/input/prev_app.csv")
In [9]:
print('Size of app_test data', app_test.shape)
print('Size of app_train data', app_train.shape)
print('Size of installment_payment data', installment_payment.shape)
print('Size of prev_app data', prev_app.shape)
Size of app_test data (14761, 24)
Size of app_train data (61503, 24)
Size of installment_payment data (2872306, 8)
Size of prev_app data (350712, 19)

Quick look of Data

In [10]:
app_test.head()
Out[10]:
Unnamed: 0 LN_ID TARGET CONTRACT_TYPE GENDER NUM_CHILDREN INCOME APPROVED_CREDIT ANNUITY PRICE ... DAYS_AGE DAYS_WORK DAYS_REGISTRATION DAYS_ID_CHANGE WEEKDAYS_APPLY HOUR_APPLY ORGANIZATION_TYPE EXT_SCORE_1 EXT_SCORE_2 EXT_SCORE_3
0 102590 219092 0 Cash loans M 3 135000.0 871029.0 44604.0 765000.0 ... -17598 -2650 -1411.0 -1131 SATURDAY 7 Business Entity Type 3 NaN 0.145475 0.651260
1 35895 141577 0 Cash loans F 0 144000.0 485640.0 34537.5 450000.0 ... -14097 -7408 -7908.0 -4872 MONDAY 14 Kindergarten NaN 0.682675 NaN
2 69154 180205 0 Cash loans F 1 90000.0 247500.0 8887.5 247500.0 ... -18384 -2826 -8226.0 -1930 SATURDAY 12 Self-employed 0.814700 0.686312 0.758393
3 222185 357381 0 Cash loans M 2 112500.0 506889.0 24781.5 418500.0 ... -12170 -926 -916.0 -4048 THURSDAY 13 Other 0.399219 0.266520 0.058826
4 147680 271229 0 Cash loans M 0 216000.0 450000.0 21888.0 450000.0 ... -10790 -577 -4640.0 -2035 MONDAY 14 Business Entity Type 3 0.368452 0.610483 0.392774

5 rows × 24 columns

In [11]:
app_train.head()
Out[11]:
Unnamed: 0 LN_ID TARGET CONTRACT_TYPE GENDER NUM_CHILDREN INCOME APPROVED_CREDIT ANNUITY PRICE ... DAYS_AGE DAYS_WORK DAYS_REGISTRATION DAYS_ID_CHANGE WEEKDAYS_APPLY HOUR_APPLY ORGANIZATION_TYPE EXT_SCORE_1 EXT_SCORE_2 EXT_SCORE_3
0 201468 333538 0 Revolving loans F 1 67500.0 202500.0 10125.0 202500.0 ... -11539 -921 -119.0 -2757 TUESDAY 18 Business Entity Type 3 0.572805 0.608276 NaN
1 264803 406644 0 Cash loans F 1 202500.0 976711.5 49869.0 873000.0 ... -15743 -4482 -1797.0 -2455 TUESDAY 14 Other 0.655600 0.684298 NaN
2 137208 259130 0 Cash loans F 0 180000.0 407520.0 25060.5 360000.0 ... -20775 365243 -8737.0 -4312 THURSDAY 14 NA1 NaN 0.580687 0.749022
3 269220 411997 0 Cash loans M 0 225000.0 808650.0 26086.5 675000.0 ... -20659 -10455 -4998.0 -4010 WEDNESDAY 10 Culture NaN 0.623740 0.710674
4 122096 241559 0 Revolving loans M 0 135000.0 180000.0 9000.0 180000.0 ... -9013 -1190 -3524.0 -1644 SUNDAY 11 Construction 0.175511 0.492994 0.085595

5 rows × 24 columns

installment_payment.head()

In [13]:
prev_app.head()
Out[13]:
Unnamed: 0 SK_ID_PREV LN_ID CONTRACT_TYPE ANNUITY APPLICATION APPROVED_CREDIT AMT_DOWN_PAYMENT PRICE WEEKDAYS_APPLY HOUR_APPLY CONTRACT_STATUS DAYS_DECISION TERM_PAYMENT YIELD_GROUP FIRST_DRAW FIRST_DUE TERMINATION NFLAG_INSURED_ON_APPROVAL
0 0 2030495 271877 Consumer loans 1730.430 17145.0 17145.0 0.0 17145.0 SATURDAY 15 Approved -73 12.0 middle 365243.0 -42.0 -37.0 0.0
1 3 2819243 176158 Cash loans 47041.335 450000.0 470790.0 NaN 450000.0 MONDAY 7 Approved -512 12.0 middle 365243.0 -482.0 -177.0 1.0
2 5 1383531 199383 Cash loans 23703.930 315000.0 340573.5 NaN 315000.0 SATURDAY 8 Approved -684 18.0 low_normal 365243.0 -654.0 -137.0 1.0
3 6 2315218 175704 Cash loans NaN 0.0 0.0 NaN NaN TUESDAY 11 Canceled -14 NaN NA1 NaN NaN NaN NaN
4 10 1715995 447712 Cash loans 11368.620 270000.0 335754.0 NaN 270000.0 FRIDAY 7 Approved -735 54.0 low_normal 365243.0 -705.0 -334.0 1.0

Checking for missing data

In [14]:
# check in app_test
total = app_test.isnull().sum().sort_values(ascending = False)
percent = (app_test.isnull().sum()/app_test.isnull().count()*100).sort_values(ascending = False)
missing_app_test_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_app_test_data.head()
Out[14]:
Total Percent
EXT_SCORE_1 8312 56.310548
EXT_SCORE_3 2919 19.775083
PRICE 20 0.135492
EXT_SCORE_2 19 0.128718
INCOME_TYPE 0 0.000000
In [15]:
# check in app_train
total = app_train.isnull().sum().sort_values(ascending = False)
percent = (app_train.isnull().sum()/app_train.isnull().count()*100).sort_values(ascending = False)
missing_app_train_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_app_train_data.head()
Out[15]:
Total Percent
EXT_SCORE_1 34845 56.655773
EXT_SCORE_3 12239 19.899842
EXT_SCORE_2 134 0.217876
PRICE 62 0.100808
ANNUITY 1 0.001626
In [16]:
# check in installment_payment
total = installment_payment.isnull().sum().sort_values(ascending = False)
percent = (installment_payment.isnull().sum()/installment_payment.isnull().count()*100).sort_values(ascending = False)
missing_installment_payment_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_installment_payment_data.head()
Out[16]:
Total Percent
AMT_PAY 673 0.023431
PAY_DAYS 673 0.023431
AMT_INST 0 0.000000
INST_DAYS 0 0.000000
INST_NUMBER 0 0.000000
In [17]:
# check in prev_app
total = prev_app.isnull().sum().sort_values(ascending = False)
percent = (prev_app.isnull().sum()/prev_app.isnull().count()*100).sort_values(ascending = False)
missing_prev_app_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_prev_app_data.head()
Out[17]:
Total Percent
AMT_DOWN_PAYMENT 186507 53.179532
NFLAG_INSURED_ON_APPROVAL 139305 39.720625
FIRST_DUE 139305 39.720625
FIRST_DRAW 139305 39.720625
TERMINATION 139305 39.720625

Exploration - Preparation

In [38]:
from plotly import tools

def bar_hor(df, col, title, color, w=None, h=None, lm=0, limit=100, return_trace=False, rev=False, xlb = False):
    cnt_srs = df[col].value_counts()
    yy = cnt_srs.head(limit).index[::-1] 
    xx = cnt_srs.head(limit).values[::-1] 
    if rev:
        yy = cnt_srs.tail(limit).index[::-1] 
        xx = cnt_srs.tail(limit).values[::-1] 
    if xlb:
        trace = go.Bar(y=xlb, x=xx, orientation = 'h', marker=dict(color=color))
    else:
        trace = go.Bar(y=yy, x=xx, orientation = 'h', marker=dict(color=color))
    if return_trace:
        return trace 
    layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
    data = [trace]
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

def bar_hor_noagg(x, y, title, color, w=None, h=None, lm=0, limit=100, rt=False):
    trace = go.Bar(y=x, x=y, orientation = 'h', marker=dict(color=color))
    if rt:
        return trace
    layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
    data = [trace]
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)


def bar_ver_noagg(x, y, title, color, w=None, h=None, lm=0, rt = False):
    trace = go.Bar(y=y, x=x, marker=dict(color=color))
    if rt:
        return trace
    layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
    data = [trace]
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
    
def gp(col, title):
    df1 = app_train[app_train["TARGET"] == 1]
    df0 = app_train[app_train["TARGET"] == 0]
    a1 = df1[col].value_counts()
    b1 = df0[col].value_counts()
    
    total = dict(app_train[col].value_counts())
    x0 = a1.index
    x1 = b1.index
    
    y0 = [float(x)*100 / total[x0[i]] for i,x in enumerate(a1.values)]
    y1 = [float(x)*100 / total[x1[i]] for i,x in enumerate(b1.values)]

    trace1 = go.Bar(x=a1.index, y=y0, name='Target : 1', marker=dict(color="#44ff54"))
    trace2 = go.Bar(x=b1.index, y=y1, name='Target : 0', marker=dict(color="#ff4444"))
    return trace1, trace2

Exploratory - app_train

In [29]:
# Target Variable Distribution 
bar_hor(app_train, "TARGET", "Distribution of Target Variable" , ["#44ff54", '#ff4444'], h=350, w=600, lm=200, xlb = ['Target : 1','Target : 0'])

The Target variable is slightly imbalance with the majority of loans has the target equals to 0 which indicates that individuals did not had any problems in paying installments in given time.

In [39]:
tr0 = bar_hor(app_train, "GENDER", "Distribution of GENDER Variable" ,"#f975ae", w=700, lm=100, return_trace= True)
tr1, tr2 = gp('GENDER', 'Distribution of Target with Applicant Gender')

fig = tools.make_subplots(rows=1, cols=3, print_grid=False, subplot_titles = ["Gender Distribution" , "Gender, Target=1" ,"Gender, Target=0"])
fig.append_trace(tr0, 1, 1);
fig.append_trace(tr1, 1, 2);
fig.append_trace(tr2, 1, 3);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=50));
iplot(fig);

In the applicant's data women have applied for a larger majority of loans which is almost the double as the men. And a larger percentage (about 10% of the total) of men had the problems in paying the loan or making installments within time as compared to women applicants (about 7%).

In [40]:
tr0 = bar_hor(app_train, "FAMILY_STATUS", "Distribution of FAMILY_STATUS Variable" ,"#f975ae", w=700, lm=100, return_trace= True)
tr1, tr2 = gp('FAMILY_STATUS', 'Distribution of Target with Applicant Family Status')

fig = tools.make_subplots(rows=1, cols=3, print_grid=False, subplot_titles = ["Family Status Distribution" , "Family Status, Target = 1" ,"Family Status, Target = 0"])
fig.append_trace(tr0, 1, 1);
fig.append_trace(tr1, 1, 2);
fig.append_trace(tr2, 1, 3);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=120));
iplot(fig);

Married people have applied for a larger number of loan applications. However, people having Civil Marriage has the highest percentage (about 10%) of loan problems and challenges.

In [41]:
tr1 = bar_hor(app_train, "EDUCATION", "Distribution of Applicant's Education" ,"#f975ae", w=700, lm=100, return_trace= True)
tr2 = bar_hor(app_train, "HOUSING_TYPE", "Distribution of Applicant's House Types" ,"#f975ae", w=700, lm=100, return_trace = True)

fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['Applicants Education Type', 'Applicants Housing Type' ])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr2, 1, 2);
fig['layout'].update(height=400, showlegend=False, margin=dict(l=100));
iplot(fig);


tr1, tr2 = gp('EDUCATION', 'Applicants Education Types which repayed the loan')
tr3, tr4 = gp('HOUSING_TYPE', 'Applicants Housing Types which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, 
                          subplot_titles = ["Applicants Education Types, Target=1", "Applicants Housing Type, Target=1"])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr3, 1, 2);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=30));
iplot(fig);

A large number of applications (44K) are filed by people having secondary education followed by people with Higher Education with 15K applications. Applicants living in House / apartments has the highest number of loan apllications equal to 55K. While we see that the applicants with Lower Secondary education status has the highest percentage of payment related problems. Also, Applicants living in apartments or living with parents also shows the same trend.

In [42]:
tr1 = bar_hor(app_train, "INCOME_TYPE", "Distribution of INCOME_TYPE Variable" ,"#f975ae", w=700, lm=100, return_trace= True)

fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['Applicants Income Type'])
fig.append_trace(tr1, 1, 2);
fig['layout'].update(height=400, showlegend=False, margin=dict(l=100));
iplot(fig);

The income type of people who applies for loan include about 8 categroes, top ones are :

  • Working Class (32K)
  • Commercial Associate (14K)
  • Pensiner (11K)
In [54]:
t = app_train['CONTRACT_TYPE'].value_counts()
labels = t.index
values = t.values
colors = ['#FEBFB3','#96D38C']
trace = go.Pie(labels=labels, values=values,
               hoverinfo='all', textinfo='none',
               textfont=dict(size=12),
               marker=dict(colors=colors,
                           line=dict(color='#fff', width=2)))
layout = go.Layout(title='Applicants Contract Type', height=400)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

Cash loans with about 56K loans contributes to a majorty of total lonas in this dataset. Revolving loans has significantly lesser number equal to about 5K as compared to Cash loans.

In [56]:
tr1 = bar_hor(app_train, "ORGANIZATION_TYPE", "Distribution of " ,"#f975ae", w=700, lm=100, return_trace= True)
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['Applicants Organization Type'])
fig.append_trace(tr1, 1, 1);
fig['layout'].update(height=600, showlegend=False, margin=dict(l=150));
iplot(fig);
In [57]:
tr1, tr2 = gp('ORGANIZATION_TYPE', 'Applicants Income Types which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, 
                          subplot_titles = ["Applicants Organization Types - Repayed"])
fig.append_trace(tr1, 1, 1);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=120));
iplot(fig);
In [58]:
plt.figure(figsize=(12,5))
plt.title("Distribution of APPROVED_CREDIT")
ax = sns.distplot(app_train["APPROVED_CREDIT"])
D:\Anaconda\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

In [59]:
plt.figure(figsize=(12,5))
plt.title("Distribution of ANNUITY")
ax = sns.distplot(app_train["ANNUITY"].dropna())
In [60]:
plt.figure(figsize=(12,5))
plt.title("Distribution of PRICE")
ax = sns.distplot(app_train["PRICE"].dropna())
In [61]:
plt.figure(figsize=(12,5))
plt.title("Distribution of DAYS_AGE")
ax = sns.distplot(app_train["DAYS_AGE"])
In [62]:
plt.figure(figsize=(12,5))
plt.title("Distribution of DAYS_WORK")
ax = sns.distplot(app_train["DAYS_WORK"])
In [63]:
plt.figure(figsize=(12,5))
plt.title("Distribution of DAYS_REGISTRATION")
ax = sns.distplot(app_train["DAYS_REGISTRATION"])
In [64]:
t = app_train["NUM_CHILDREN"].value_counts()
t1 = pd.DataFrame()
t1['x'] = t.index 
t1['y'] = t.values 

plt.figure(figsize=(12,5));
plt.title("Distribution of Applicant's Number of Children");
ax = sns.barplot(data=t1, x="x", y="y", color="#f975ae");
ax.spines['right'].set_visible(False);
ax.spines['top'].set_visible(False);

ax.set_ylabel('');    
ax.set_xlabel('');

Exploration - prev_app

In [67]:
t = prev_app['CONTRACT_STATUS'].value_counts()
labels = t.index
values = t.values

colors = ['#96D38C', '#E1396C', '#FEBFB3', '#D0F9B1']

trace = go.Pie(labels=labels, values=values,
               hoverinfo='all', textinfo='none',
               textfont=dict(size=12),
               marker=dict(colors=colors,
                           line=dict(color='#fff', width=2)))

layout = go.Layout(title='Name Contract Status in Previous Applications', height=400)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

A large number of people (about 63%) had their previous applications approved, while about 18% of them had cancelled and other 17% were refused.

Model

Preparation

In [83]:
from sklearn.model_selection import train_test_split 
import lightgbm as lgb

# read the test files 
app_test = pd.read_csv('C:/Users/Romansya/Home Credit Risk/input/app_test.csv')

app_test['is_test'] = 1 
app_test['is_train'] = 0
app_train['is_test'] = 0
app_train['is_train'] = 1

# target variable
Y = app_train['TARGET']
train_X = app_train.drop(['TARGET'], axis = 1)

# test ID
test_id = app_test['LN_ID']
test_X = app_test

# merge train and test datasets for preprocessing
data = pd.concat([train_X, test_X], axis=0, sort=True)
In [86]:
# function to obtain Categorical Features
def _get_categorical_features(df):
    feats = [col for col in list(df.columns) if df[col].dtype == 'object']
    return feats

# function to factorize categorical features
def _factorize_categoricals(df, cats):
    for col in cats:
        df[col], _ = pd.factorize(df[col])
    return df 

# function to create dummy variables of categorical features
def _get_dummies(df, cats):
    for col in cats:
        df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis=1)
    return df 

# get categorical features
data_cats = _get_categorical_features(data)
prev_app_cats = _get_categorical_features(prev_app)

# create additional dummy features - 
prev_app = _get_dummies(prev_app, prev_app_cats)

# factorize the categorical features from train and test data
data = _factorize_categoricals(data, data_cats)

Feature Engineering

prev_app

In [88]:
# count the number of previous applications for a given ID
prev_apps_count = prev_app[['LN_ID', 'SK_ID_PREV']].groupby('LN_ID').count()
prev_app['SK_ID_PREV'] = prev_app['LN_ID'].map(prev_apps_count['SK_ID_PREV'])

# Average values for all other features in previous applications
prev_apps_avg = prev_app.groupby('LN_ID').mean()
prev_apps_avg.columns = ['p_' + col for col in prev_apps_avg.columns]
data = data.merge(right=prev_apps_avg.reset_index(), how='left', on='LN_ID')

installment_payment

In [90]:
## count the number of previous installments
cnt_inst = installment_payment[['LN_ID', 'SK_ID_PREV']].groupby('LN_ID').count()
installment_payment['SK_ID_PREV'] = installment_payment['LN_ID'].map(cnt_inst['SK_ID_PREV'])

## Average values for all other variables in installments payments
avg_inst = installment_payment.groupby('LN_ID').mean()
avg_inst.columns = ['i_' + f_ for f_ in avg_inst.columns]
data = data.merge(right=avg_inst.reset_index(), how='left', on='LN_ID')

Preparing final app_train and app_test

In [91]:
ignore_features = ['LN_ID', 'is_train', 'is_test']
relevant_features = [col for col in data.columns if col not in ignore_features]
trainX = data[data['is_train'] == 1][relevant_features]
testX = data[data['is_test'] == 1][relevant_features]

Validation Sets

In [92]:
x_train, x_val, y_train, y_val = train_test_split(trainX, Y, test_size=0.2, random_state=18)
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_eval = lgb.Dataset(data=x_val, label=y_val)

Fitting

In [93]:
params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
          'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 5000, 'verbose': 0 ,
          'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
          'min_split_gain':.01, 'min_child_weight':1}
model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=200)
D:\Anaconda\lib\site-packages\lightgbm\engine.py:113: UserWarning:

Found `num_iteration` in params. Will use it instead of argument

Training until validation scores don't improve for 150 rounds.
[200]	valid_0's auc: 0.755616
[400]	valid_0's auc: 0.764124
[600]	valid_0's auc: 0.765327
Early stopping, best iteration is:
[595]	valid_0's auc: 0.765412

Feature Importance

In [94]:
lgb.plot_importance(model, figsize=(12, 25), max_num_features=100);

Predicting

In [95]:
pred = model.predict(testX)
sub_lgb = pd.DataFrame()
sub_lgb['LN_ID'] = test_id
sub_lgb['TARGET'] = pred
sub_lgb.to_csv("hc_try_lgb_baseline.csv", index=False)
sub_lgb.head()
Out[95]:
LN_ID TARGET
0 219092 0.103528
1 141577 0.029865
2 180205 0.018519
3 357381 0.432418
4 271229 0.066454

Result Check

Preparation

In [96]:
sub_lgb.head()
Out[96]:
LN_ID TARGET
0 219092 0.103528
1 141577 0.029865
2 180205 0.018519
3 357381 0.432418
4 271229 0.066454
In [99]:
thresh = 0.5
sub_lgb['PREDICTED_TARGET'] = (sub_lgb.TARGET >= 0.5).astype('int')
sub_lgb.head()
Out[99]:
LN_ID TARGET REAL_TARGET PREDICTED_TARGET
0 219092 0.103528 0 0
1 141577 0.029865 0 0
2 180205 0.018519 0 0
3 357381 0.432418 0 0
4 271229 0.066454 0 0
In [100]:
sub_lgb['REAL_TARGET'] = app_test['TARGET']
sub_lgb.head()
Out[100]:
LN_ID TARGET REAL_TARGET PREDICTED_TARGET
0 219092 0.103528 0 0
1 141577 0.029865 0 0
2 180205 0.018519 0 0
3 357381 0.432418 0 0
4 271229 0.066454 0 0
In [102]:
from sklearn.metrics import confusion_matrix
confusion_matrix(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
Out[102]:
array([[13545,    13],
       [ 1183,    20]], dtype=int64)
In [103]:
def find_TP(y_true, y_pred):
    # counts the number of true positives (y_true = 1, y_pred = 1)
    return sum((y_true == 1) & (y_pred == 1))
def find_FN(y_true, y_pred):
    # counts the number of false negatives (y_true = 1, y_pred = 0)
    return sum((y_true == 1) & (y_pred == 0))
def find_FP(y_true, y_pred):
    # counts the number of false positives (y_true = 0, y_pred = 1)
    return sum((y_true == 0) & (y_pred == 1))
def find_TN(y_true, y_pred):
    # counts the number of true negatives (y_true = 0, y_pred = 0)
    return sum((y_true == 0) & (y_pred == 0))
In [104]:
print('TP:',find_TP(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values))
print('FN:',find_FN(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values))
print('FP:',find_FP(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values))
print('TN:',find_TN(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values))
TP: 20
FN: 1183
FP: 13
TN: 13545
In [105]:
import numpy as np
def find_conf_matrix_values(y_true,y_pred):
    # calculate TP, FN, FP, TN
    TP = find_TP(y_true,y_pred)
    FN = find_FN(y_true,y_pred)
    FP = find_FP(y_true,y_pred)
    TN = find_TN(y_true,y_pred)
    return TP,FN,FP,TN
def my_confusion_matrix(y_true, y_pred):
    TP,FN,FP,TN = find_conf_matrix_values(y_true,y_pred)
    return np.array([[TN,FP],[FN,TP]])
In [106]:
my_confusion_matrix(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
Out[106]:
array([[13545,    13],
       [ 1183,    20]])

Accuracy

In [107]:
from sklearn.metrics import accuracy_score
accuracy_score(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
Out[107]:
0.9189756791545288

Recall

In [108]:
from sklearn.metrics import recall_score
recall_score(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
Out[108]:
0.01662510390689942

Precision

In [109]:
from sklearn.metrics import precision_score
precision_score(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
Out[109]:
0.6060606060606061

F1 Score

In [110]:
from sklearn.metrics import f1_score
f1_score(sub_lgb.REAL_TARGET.values, sub_lgb.PREDICTED_TARGET.values)
Out[110]:
0.032362459546925564